'''
https://www.raspberrypi.org
water
2017-12-01
Download Raspberry Pi product details (descriptions, specifications and linked uploads).
'''

import os
import codecs
import requests

# Local directory under which the page cache and all downloaded files are written.
root = "D:/Pi"
# Base URL of the site being scraped.
home = "https://www.raspberrypi.org/"
# Path appended to `home` to form the URL of the product listing page.
default = "products/"

def find_str_between(l, s_s, s_e):
    """Return the text of ``l`` between the first ``s_s`` and the next ``s_e``.

    Args:
        l: String to search.
        s_s: Start delimiter; the result begins right after its first
            occurrence in ``l``.
        s_e: End delimiter, searched for after the start delimiter. An empty
            string means "take everything up to the end of ``l``".

    Returns:
        The enclosed substring, or ``""`` when either delimiter is missing.
    """
    i_s = l.find(s_s)
    if i_s == -1:
        return ""
    s = l[i_s + len(s_s):]
    if s_e == "":
        return s
    i_e = s.find(s_e)
    if i_e == -1:
        # Without this guard the -1 from find() would be used as a slice
        # bound and silently drop the last character of the remainder.
        return ""
    return s[:i_e]

def html2txt(s):
    """Strip every ``<...>`` tag from ``s`` and return the remaining text.

    A tag is the span from a ``<`` to the next ``>`` (it may contain
    newlines). A ``<`` with no following ``>`` is not a tag and is left in
    place, which also guarantees the scan terminates on malformed input.

    Args:
        s: HTML fragment.

    Returns:
        ``s`` with all tags removed; entities and whitespace are untouched.
    """
    parts = []
    pos = 0
    while True:
        start = s.find('<', pos)
        if start == -1:
            break
        end = s.find('>', start + 1)
        if end == -1:
            # Unterminated '<': keep the rest of the text verbatim.
            break
        parts.append(s[pos:start])
        pos = end + 1
    parts.append(s[pos:])
    return ''.join(parts)

def down_img(href, file):
    """Download ``href`` to the local path ``file`` unless it already exists.

    The parent directory of ``file`` is created when missing. Network, HTTP
    status and file-system errors during the download are reported on stdout
    and otherwise ignored, so one bad link does not abort the whole run.

    Args:
        href: URL of the resource to fetch.
        file: Destination path on disk.
    """
    s_p = os.path.dirname(file)
    # dirname is empty for a bare file name, and makedirs("") would raise.
    if s_p:
        os.makedirs(s_p, exist_ok=True)

    if os.path.exists(file):
        return

    try:
        r = requests.get(href, timeout=30)
        # Do not store an HTML error page under an image file name.
        r.raise_for_status()
        with open(file, "wb") as f:
            f.write(r.content)
    except (requests.RequestException, OSError):
        print("\tError: %s" % file)
def getpage(url):
    """Return the HTML of ``url``, using an on-disk cache under ``root``/temp.

    The cache file name is ``url`` with ``/`` replaced by ``-`` and ``:`` by
    ``_``. When that file exists its content is returned without touching the
    network; otherwise the page is fetched, decoded as UTF-8 and cached.

    Args:
        url: Absolute URL of the page.

    Returns:
        The page text, or ``""`` when fetching, decoding or caching failed
        (an error line is printed in that case).
    """
    path = "%s/temp" % root
    os.makedirs(path, exist_ok=True)
    fn = "%s/%s" % (path, url.replace("/", "-").replace(":", "_"))

    if os.path.exists(fn):
        with codecs.open(fn, "r", "utf-8") as f:
            return f.read()

    try:
        r = requests.get(url, timeout=30)
        # An HTTP error page must not be cached as if it were the real page.
        r.raise_for_status()
        s = r.content.decode("utf-8")
        with codecs.open(fn, "w", "utf-8") as f:
            f.write(s)
    except (requests.RequestException, OSError, UnicodeError):
        print("\terror! %s" % url)
        return ""
    return s

def downdetail(url):
    """Scrape one product page into ``<root>/RPI-<last URL segment>``.

    While walking the page line by line this:

    * downloads every ``https://www.raspberrypi.org/app/uploads`` link into
      the product directory as ``rpi_<file name>``;
    * collects the hero description line and the lines inside the
      "getting started" block as notes;
    * collects the lines inside the specifications tab panel as details.

    When a line containing ``</html>`` is reached, the collected notes and
    details are tag-stripped and written to ``notes.txt`` and ``detail.txt``.

    Args:
        url: Absolute product page URL, with or without a trailing slash.
    """
    # rstrip instead of url[:-1] so a URL without a trailing slash does not
    # lose the last character of its final segment.
    t1 = os.path.basename(url.rstrip('/'))
    path = "%s/RPI-%s" % (root, t1)
    os.makedirs(path, exist_ok=True)

    s = getpage(url)
    if s == "":
        return
    # Pull a cell's content onto the same line as its opening <td>.
    s = s.replace('<td>\n', '<td>')

    # Bound up front: pages with no line containing a bare "<head>" would
    # otherwise hit a NameError on the first append below.
    s_txt = ""
    s_inf = ""
    b_detail = False
    b_table = False
    for l in s.split('\n'):
        if '<head>' in l:
            s_txt = ""
            s_inf = ""
        if '<a href="https://www.raspberrypi.org/app/uploads' in l:
            src = find_str_between(l, 'href="', '"')
            s_file = '%s/rpi_%s' % (path, os.path.basename(src))
            down_img(src, s_file)
        if '<p class="product-hero__description">' in l:
            s_txt += '%s\n' % l
        # Any closing </div> ends both capture regions. It is checked before
        # the appends so the closing line itself is not captured.
        if '</div>' in l:
            b_detail = False
            b_table = False
        if b_detail:
            s_txt += l + '\n'
        if b_table:
            s_inf += '%s\n' % l
        # The start markers are checked after the appends so the marker line
        # itself is excluded and capture begins on the following line.
        if '<div class="product-getting-started__inner">' in l:
            b_detail = True
        if '<div class="product-specifications product-tabs__tab-panel"' in l:
            b_table = True
        if '</html>' in l:
            s_txt = html2txt(s_txt)
            with codecs.open(path + '/notes.txt', 'w', 'utf-8') as f:
                f.write(s_txt)

            s_inf = html2txt(s_inf)
            with codecs.open(path + '/detail.txt', 'w', 'utf-8') as f:
                f.write(s_inf)

def down():
    """Download the product listing and every product page linked from it.

    Fetches ``home + default``, and for each line holding a complete
    ``product-list__item-link`` anchor prints its title and link, scrapes the
    linked page with ``downdetail`` and finally writes all title/link rows to
    ``<root>/index_rpi.txt``. Does nothing when the listing cannot be fetched.
    """
    os.makedirs(root, exist_ok=True)

    s = getpage(home + default)
    if s == "":
        return

    rows = []
    for l in s.split('\n'):
        if '<a class="clearfix product-list__item-link"' in l and '</a>' in l:
            tit = find_str_between(l, '">', '<')
            href = find_str_between(l, 'href="', '"')
            row = '%-40s %s' % (tit, href)
            rows.append(row + '\n')
            print(row)
            # An anchor without a usable href is still listed in the index,
            # but there is no page to scrape for it.
            if href:
                downdetail(href)

    with codecs.open(root + '/index_rpi.txt', 'w', 'utf-8') as f:
        f.write(''.join(rows))

# Start the full download only when run as a script, not when imported.
if __name__ == "__main__":
    down()